1. Data preprocessing
  1. Importing the dataset
library(readr)
# Load the training set from CSV into a tibble.
# NOTE(review): the absolute path is specific to the author's machine.
training_data <- read_csv(file = "C:/Users/Asus/Downloads/Training DataSet.csv")
## Rows: 102351 Columns: 13
## ── Column specification ─────────────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Gender, Licence_Type, Previously_Insured, Vehicle_Age, Vehicle_Damage
## dbl (8): id, Age, Driving_License, Region_Code, Annual_Premium, Policy_Sales_Channel, Seniority, Target
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the data viewer only in an interactive session; View() needs a GUI and
# should not run when the script is rendered or executed in batch mode.
if (interactive()) {
  View(training_data)
}

# Install only the reporting packages that are not available yet. Calling
# install.packages() on packages that are already loaded fails with
# "Updating loaded packages" and would re-download them on every run.
reporting_pkgs <- c("rmarkdown", "knitr", "htmltools", "xfun")
missing_reporting_pkgs <- reporting_pkgs[
  !vapply(reporting_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_reporting_pkgs) > 0) {
  install.packages(
    missing_reporting_pkgs,
    dependencies = TRUE,
    repos = "https://cloud.r-project.org/"
  )
}
## Error in install.packages : Updating loaded packages
  1. Checking the data types
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Print one line per column: name, type and the first few values.
glimpse(x = training_data)
## Rows: 102,351
## Columns: 13
## $ id                   <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,…
## $ Gender               <chr> "Female", "Male", "Male", "Male", "Female", "Female", "Male", "Male", "Fem…
## $ Age                  <dbl> 44, 23, 43, 60, 42, 40, 26, 28, 23, 33, 36, 23, 75, 48, 43, 48, 22, 38, 40…
## $ Driving_License      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ Licence_Type         <chr> "M", "M", "C", "B", "B", "A", "D", "D", "B", "A", "C", "B", "B", "A", "M",…
## $ Region_Code          <dbl> 30, 3, 28, 28, 46, 8, 46, 28, 29, 30, 28, 3, 8, 46, 23, 28, 41, 28, 46, 36…
## $ Previously_Insured   <chr> "No", "Yes", "No", "Yes", "No", "No", "Yes", "No", "No", "No", "No", "Yes"…
## $ Vehicle_Age          <chr> "1-2 Year", "< 1 Year", "1-2 Year", "1-2 Year", "1-2 Year", "1-2 Year", "<…
## $ Vehicle_Damage       <chr> "Yes", "No", "Yes", "No", "Yes", "Yes", "No", "Yes", "Yes", "Yes", "Yes", …
## $ Annual_Premium       <dbl> 25741, 2630, 70720, 48512, 32169, 34092, 22452, 36869, 34853, 27740, 45244…
## $ Policy_Sales_Channel <dbl> 154, 152, 52, 26, 124, 124, 152, 124, 152, 157, 26, 152, 78, 11, 26, 26, 1…
## $ Seniority            <dbl> 152, 256, 224, 86, 132, 285, 59, 60, 83, 254, 93, 201, 95, 52, 273, 152, 1…
## $ Target               <dbl> 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,…

After checking the data types, the proper types should be set based on the variable descriptions. Categorical variables should be converted to factors.

# The row identifier carries no information for the analysis, so drop it.
training_data <- training_data %>%
  select(-id)

# Columns that are categorical by meaning, even when stored as numbers.
factor_variables <- c(
  "Gender",
  "Driving_License",
  "Licence_Type",
  "Region_Code",
  "Previously_Insured",
  "Vehicle_Age",
  "Vehicle_Damage",
  "Target",
  "Policy_Sales_Channel"
)

# Convert each of those columns to a factor in place.
training_data <- training_data %>%
  mutate(across(all_of(factor_variables), as.factor))
  1. Checking the missing values.
# Count the missing values in every column.
training_data %>%
  is.na() %>%
  colSums()
##               Gender                  Age      Driving_License         Licence_Type          Region_Code 
##                    0                    0                    0                 5091                    0 
##   Previously_Insured          Vehicle_Age       Vehicle_Damage       Annual_Premium Policy_Sales_Channel 
##                    0                    0                    0                    0                    0 
##            Seniority               Target 
##                    0                    0

There are 5091 missing values in the licence type variable. There are several methods for imputing missing values; however, imputation can introduce bias, so we decided to delete the affected rows, since plenty of data still remains.

# Keep only the rows without any missing value.
training_data <- training_data %>%
  na.omit()
  1. Descriptive Analytics and relationship checking:

In this section, we analyze key numerical and categorical variables to understand their distributions and relationships. We explore customer demographics, vehicle characteristics, and past insurance behavior to identify potential patterns and factors influencing interest in insurance.

  1. Descriptive statistics
# Install only the table/summary packages that are not available yet.
# Unconditional install.packages() calls on packages that are already loaded
# fail with "Updating loaded packages" and re-download on every run.
table_pkgs <- c("htmltools", "rlang", "gt", "xfun", "skimr")
missing_table_pkgs <- table_pkgs[
  !vapply(table_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_table_pkgs) > 0) {
  install.packages(missing_table_pkgs, repos = "https://cloud.r-project.org/")
}

# Attach the packages used for the descriptive-statistics table.
library(dplyr)
library(htmltools)
## Warning: package 'htmltools' was built under R version 4.3.3
library(gt)
## Warning: package 'gt' was built under R version 4.3.3
library(skimr)
## Warning: package 'skimr' was built under R version 4.3.3
# Step 1: skim the numeric columns and keep mean, SD, quartiles and the
# share of non-missing values, under reader-friendly column names.
numeric_stats <- skim(training_data %>% select(where(is.numeric)))
numeric_stats <- numeric_stats %>%
  select(
    skim_variable,
    numeric.mean,
    numeric.sd,
    numeric.p25,
    numeric.p50,
    numeric.p75,
    complete_rate
  ) %>%
  rename(
    Variable = skim_variable,
    Mean = numeric.mean,
    SD = numeric.sd,
    Q1 = numeric.p25,
    Median = numeric.p50,
    Q3 = numeric.p75,
    Complete_Rate = complete_rate
  )

# Step 2: render the statistics as a formatted gt table.
summary_table <- gt(numeric_stats) %>%
  tab_header(title = "Descriptive Statistics of Numeric Variables") %>%
  fmt_number(columns = c(Mean, SD, Q1, Median, Q3), decimals = 2) %>%
  cols_label(
    Q1 = "25th Percentile",
    Q3 = "75th Percentile",
    Complete_Rate = "Completeness"
  ) %>%
  tab_style(
    style = cell_borders(sides = "bottom", color = "black", weight = px(2)),
    locations = cells_column_labels()
  ) %>%
  tab_options(table.font.size = 12)

summary_table
Descriptive Statistics of Numeric Variables
Variable Mean SD 25th Percentile Median 75th Percentile Completeness
Age 39.86 14.95 25.00 39.00 50.00 1
Annual_Premium 30,743.08 17,574.20 24,432.50 31,951.50 39,758.00 1
Seniority 154.58 83.71 82.00 155.00 227.00 1

In our analysis, we primarily focus on examining the Target variable, as it represents customer interest in purchasing insurance. By examining its relationship with other features, we aim to identify key factors that influence whether a customer is interested or not. Understanding these patterns can help optimize marketing strategies and customer targeting. Target is a categorical variable that shows whether the customer is interested in the product. The values can be interpreted as follows: 0: The customer is not interested in the product. 1: The customer is interested in the product.

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
# Collect the names of all factor columns; these are the categorical variables.
categorical_vars <- training_data %>%
  select(where(is.factor)) %>%
  names()
# Region_Code and Policy_Sales_Channel are left out of the pie charts.
categorical_vars <- setdiff(categorical_vars, c("Region_Code", "Policy_Sales_Channel"))

# Draw one pie chart per remaining categorical variable.
for (var in categorical_vars) {
  # Frequency and percentage share of every level of the current variable.
  category_data <- training_data %>%
    count(!!sym(var)) %>%
    mutate(percent = n / sum(n) * 100)

  # A single stacked bar bent into polar coordinates gives the pie chart.
  p <- ggplot(category_data, aes(x = "", y = n, fill = !!sym(var))) +
    # `linewidth` replaces `size` for line widths (deprecated since ggplot2 3.4.0).
    geom_bar(stat = "identity", width = 1, color = "white", linewidth = 0.5) +
    coord_polar(theta = "y") +
    geom_text(
      aes(label = paste0(n, " (", round(percent, 1), "%)")),
      position = position_stack(vjust = 0.5),
      color = "white",
      fontface = "bold",
      size = 3
    ) +
    # paste() already inserts a space, so the literal must not end with one.
    labs(title = paste("Distribution of", var), x = NULL, y = NULL) +
    theme_void() +
    theme(
      plot.title = element_text(hjust = 0.5, size = 16, face = "bold", color = "black"),
      legend.position = "right",
      legend.title = element_text(size = 14, face = "bold"),
      legend.text = element_text(size = 10),
      plot.margin = margin(20, 20, 20, 20)
    )

  # Plots are not auto-printed inside a loop, so print explicitly.
  print(p)
}
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

The distribution is as follows: 0 (not interested): 68.3% of the data fall into this category, which means that more than two thirds of the customers are not interested in the product. 1 (Interested): 31.7% of the data fall into this category, i.e. just under one third of the customers are interested in the product.


b. Relationship between the variables and the label

This plot illustrates how Gender is related to customer interest in purchasing insurance

# 100% stacked bars: gender composition within each Target class.
training_data %>%
  ggplot(mapping = aes(x = Target, fill = Gender)) +
  geom_bar(position = position_fill()) +
  scale_x_discrete(labels = c("0" = "Not Interested", "1" = "Interested")) +
  ggtitle("Relationship between the gender and the label variable") +
  xlab("Target") +
  ylab("Ratio") +
  theme_minimal()

The plot shows that in the “Not Interested” category, males make up just over 50% of the sample, while in the “Interested” category, they make up about 60%. This suggests that the proportion of men is higher in the “Interested” group compared to the “Not Interested” group. Several factors could explain this difference: -Targeted Marketing: The vehicle insurance product may be more appealing to male customers, leading to a higher proportion of interested males. -Buying Habits: Men might be more inclined to show interest in new insurance products, while other demographic groups could be more cautious in purchasing. -Demographic Factors: The database may take into account demographic factors, such as age or geographic location, that influence interest in purchasing vehicle insurance.


The next plot shows the relationship between Licence Type and customer interest in the product (Target: 0 = Not Interested, 1 = Interested). It uses a stacked bar chart to display the proportion of each licence type within the not interested and the interested group.

# 100% stacked bars: licence-type composition within each Target class.
training_data %>%
  ggplot(mapping = aes(x = Target, fill = Licence_Type)) +
  geom_bar(position = position_fill()) +
  scale_x_discrete(labels = c("0" = "Not Interested", "1" = "Interested")) +
  ggtitle("Relationship between the licence type and the label variable") +
  xlab("Target") +
  ylab("Ratio") +
  theme_minimal()

The plot suggests that the licence type does not affect whether customers are interested in the insurance or not. In other words, for each licence type, there is a similar ratio of interested (1) and not interested (0) customers.


The next plot illustrates how the Previously Insured status of customers (whether they were previously insured or not) is related to their interest in purchasing insurance. For the not interested and the interested group, it shows the share of customers who were previously insured versus those who were not.

# 100% stacked bars: previously-insured composition within each Target class.
training_data %>%
  ggplot(mapping = aes(x = Target, fill = Previously_Insured)) +
  geom_bar(position = position_fill()) +
  scale_x_discrete(labels = c("0" = "Not Interested", "1" = "Interested")) +
  ggtitle("Relationship between the previously insured and the label variable") +
  xlab("Target") +
  ylab("Ratio") +
  theme_minimal()

The plot shows that among those who are not interested, the distribution is roughly 50/50 between those who already have vehicle insurance and those who do not. However, among the interested group, very few already have vehicle insurance. It’s important to note, though, that the interested individuals make up only about 31% of the entire dataset.


The next plot illustrates how the Vehicle Age is related to customer interest in purchasing insurance. For the not interested and the interested group, it shows the share of customers in each vehicle age category. This helps to understand if the age of the vehicle influences the likelihood of a customer being interested in the insurance offer.

# 100% stacked bars: vehicle-age composition within each Target class.
training_data %>%
  ggplot(mapping = aes(x = Target, fill = Vehicle_Age)) +
  geom_bar(position = position_fill()) +
  scale_x_discrete(labels = c("0" = "Not Interested", "1" = "Interested")) +
  ggtitle("Relationship between the vehicle age and the label variable") +
  xlab("Target") +
  ylab("Ratio") +
  theme_minimal()

It appears that nearly half of the not interested group has a car that is less than one year old, and similarly, about half of them own a car that is between 1 and 2 years old. Around 5% of them have a car older than two years. Among those who are interested, the proportion of customers with a car older than two years is higher, but nearly 75% of them own a car that is between 1 and 2 years old. One reason could be that individuals with older cars might be more inclined to consider car insurance, as they may be looking for better coverage or feel the need for additional protection. On the other hand, owners of newer cars may feel that their vehicles are already sufficiently insured or are still under warranty, which might reduce their perceived need for additional insurance.


The next plot illustrates how Vehicle Damage is related to customer interest in purchasing insurance. For the not interested and the interested group, it shows the share of customers whose vehicle has or has not been damaged (yes or no).

# 100% stacked bars: vehicle-damage composition within each Target class.
training_data %>%
  ggplot(mapping = aes(x = Target, fill = Vehicle_Damage)) +
  geom_bar(position = position_fill()) +
  scale_x_discrete(labels = c("0" = "Not Interested", "1" = "Interested")) +
  ggtitle("Relationship between the vehicle damage and the label variable") +
  xlab("Target") +
  ylab("Ratio") +
  theme_minimal()

It appears that more than half of the not interested group has never had damage to their vehicle, while almost 100% of the interested group has previously experienced vehicle damage. Possible reasons: -Increased Awareness: People who’ve had damage to their vehicle are more aware of the importance of insurance and are more likely to be interested. -Perceived Risk: Those with past damage may feel more at risk and see insurance as necessary, while those without damage may feel less need for it. -Past Experience with Insurance: Individuals who’ve had vehicle damage may have experienced the benefits of insurance and are more likely to be interested in it again.


The next boxplot visualizes the distribution of Age for people who are interested and not interested in the insurance. It highlights the median, interquartile range (IQR), and potential outliers for each group. This can help identify if age significantly varies between interested and not interested customers.

# Side-by-side boxplots of Age, one box per Target class.
training_data %>%
  ggplot(mapping = aes(x = Target, y = Age, fill = Target)) +
  geom_boxplot(alpha = 0.6) +
  scale_x_discrete(labels = c("0" = "Not Interested", "1" = "Interested")) +
  ggtitle("Boxplot of Age by Target") +
  xlab("Target") +
  ylab("Age") +
  labs(fill = "Target") +
  theme_minimal()

In the not interested group, the box (which represents the interquartile range, covering the middle 50% of the data) spans between ages 25 and 50, with a median around 35 years. In contrast, the interested group has ages between 35 and 50, with a median slightly above 40. Possible reasons: -Younger people (not interested) may feel they have fewer concerns about insurance or are less likely to think about it. -Older people (interested) might be more aware of the importance of insurance due to greater life experience or a greater perceived need for coverage.


The next plot displays the distribution of Annual Premium for customers categorized by their interest in the insurance

# Side-by-side boxplots of Annual_Premium, one box per Target class.
ggplot(training_data, aes(x = Target, y = Annual_Premium, fill = Target)) +
  geom_boxplot(alpha = 0.6) +
  scale_x_discrete(labels = c("0" = "Not Interested", "1" = "Interested")) +
  labs(
    title = "Boxplot of Annual premium by Target",
    x = "Target",
    # The y-axis shows Annual_Premium, so it must not be labelled "Age".
    y = "Annual Premium",
    fill = "Target"
  ) +
  theme_minimal()

Both the Not Interested and Interested groups have their boxplots at a relatively low level around 30,000 on the y-axis (consistent with the overall median of about 31,950), indicating that the majority of customers in both groups have lower annual premiums. However, there are also outliers (points outside the whiskers) with very high premium values far above the bulk of the data in both groups. This suggests that while the overall distribution for both groups is similar, there are a few individuals with significantly higher premiums, which could be due to special cases such as high-value vehicles or additional coverage options. There doesn’t appear to be a significant difference between the two groups in terms of the annual premium, as both show similar patterns.


The next plot shows the distribution of Seniority (interpreted here as the number of days the customer has been with the company) for the two groups based on interest in the insurance (Target variable).

# Side-by-side boxplots of Seniority, one box per Target class.
ggplot(training_data, aes(x = Target, y = Seniority, fill = Target)) +
  geom_boxplot(alpha = 0.6) +
  scale_x_discrete(labels = c("0" = "Not Interested", "1" = "Interested")) +
  labs(
    title = "Boxplot of Seniority by Target",
    x = "Target",
    # The y-axis shows Seniority, so it must not be labelled "Age".
    y = "Seniority",
    fill = "Target"
  ) +
  theme_minimal()

Both the Not Interested (0) and Interested (1) groups have their boxplots within a similar range, between 85 and 225 days. The median for both groups is slightly above 150 days, indicating that most customers, regardless of interest, have a similar tenure with the company. This suggests that Seniority does not appear to be a significant differentiator between the two groups in terms of interest in the insurance. Both groups show similar customer tenure.

  1. Correlation matrix and heatmap
# Pairwise correlations (cor() default method) between the numeric columns.
# select(where()) replaces the superseded select_if().
cor_matrix <- training_data %>%
  select(where(is.numeric)) %>%
  cor()
print(cor_matrix)
##                         Age Annual_Premium    Seniority
## Age            1.0000000000    0.080310212 0.0001506113
## Annual_Premium 0.0803102119    1.000000000 0.0049042963
## Seniority      0.0001506113    0.004904296 1.0000000000

The correlation matrix shows weak relationships between the variables:

-Age vs Annual Premium: A very weak positive correlation (0.08), suggesting age has a minimal effect on the annual premium. -Age vs Seniority: Almost no correlation (0.0002), meaning age is unrelated to how long a customer has been with the company. -Annual Premium vs Seniority: Extremely weak correlation (0.0049), indicating no significant relationship between the premium amount and customer tenure.

Overall, the correlations between these variables are very weak, meaning that Age, Annual Premium, and Seniority do not seem to have strong relationships with each other in this dataset.


# Install plotly only when it is missing, so that repeated runs neither
# re-download it nor try to update a package that is already loaded.
if (!requireNamespace("plotly", quietly = TRUE)) {
  install.packages("plotly")
}
## Installing package into 'C:/Users/Asus/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'plotly' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Asus\AppData\Local\Temp\RtmpiQmWZb\downloaded_packages
library(plotly)
## Warning: package 'plotly' was built under R version 4.3.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Pairwise correlations between the numeric columns.
# select(where()) replaces the superseded select_if().
correlation_matrix <- training_data %>%
  select(where(is.numeric)) %>%
  cor()

# Three-stop colour scale over the z range: red at the lower end, white in
# the middle, blue at the upper end.
colorscale_custom <- list(
  list(0, "#D73027"),
  list(0.5, "#FFFFFF"),
  list(1, "#4575B4")
)

# Absolute correlations, used for all strength thresholds below.
abs_correlation <- abs(correlation_matrix)

# Strength flags: 2 when |r| > 0.95, 1 when |r| > 0.9, otherwise 0.
# NOTE(review): not referenced by the plot below — confirm whether it is used
# elsewhere before removing it.
highlight_matrix <- ifelse(abs_correlation > 0.95, 2,
                           ifelse(abs_correlation > 0.9, 1, 0))

# Interactive heatmap on a fixed [-1, 1] scale.
corr_plot <- plot_ly(
  x = colnames(correlation_matrix),
  y = rownames(correlation_matrix),
  z = correlation_matrix,
  type = "heatmap",
  colorscale = colorscale_custom,
  zmin = -1,
  zmax = 1
) %>%
  add_annotations(
    # The matrix is flattened column by column, so each column name is
    # repeated for a whole column while the row names cycle.
    x = rep(colnames(correlation_matrix), each = nrow(correlation_matrix)),
    y = rep(rownames(correlation_matrix), ncol(correlation_matrix)),
    # Only cells with |r| > 0.9 are labelled; all other cells stay blank.
    text = ifelse(abs_correlation > 0.9, round(correlation_matrix, 2), ""),
    showarrow = FALSE,
    font = list(color = "black", size = 12)
  ) %>%
  layout(title = "Correlation Matrix")

corr_plot